In [1]:
import pickle
import time
import os

import helpsk as hlp
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.io as pio

pio.renderers.default='notebook'

Results¶

In [2]:
file_name = os.path.join('../../models/experiments/', 'multi-model-BayesSearchCV-2022-03-01-22-52-08.yaml')
results = hlp.sklearn_eval.MLExperimentResults.from_yaml_file(yaml_file_name = file_name)

Best Scores/Params¶

In [3]:
results.best_score
Out[3]:
0.7689418281963682
In [4]:
results.best_params
Out[4]:
{'model': 'RandomForestClassifier()',
 'max_features': 0.797397050836895,
 'max_depth': 50,
 'n_estimators': 539,
 'min_samples_split': 29,
 'min_samples_leaf': 4,
 'max_samples': 0.6620299911421869,
 'criterion': 'gini',
 'imputer': "SimpleImputer(strategy='most_frequent')",
 'scaler': 'None',
 'pca': 'None',
 'encoder': 'OneHotEncoder()'}
In [5]:
# Best model from each model-type.
df = results.to_formatted_dataframe(return_style=False, include_rank=True)
df["model_rank"] = df.groupby("model")["roc_auc Mean"].rank(method="first", ascending=False)
df.query('model_rank == 1')
Out[5]:
rank roc_auc Mean roc_auc 95CI.LO roc_auc 95CI.HI model C max_features max_depth n_estimators min_samples_split min_samples_leaf max_samples criterion imputer scaler pca encoder model_rank
4 1 0.769 0.723 0.814 RandomForestClassifier() NaN 0.797397 50.0 539.0 29.0 4.0 0.66203 gini SimpleImputer(strategy='most_frequent') None None OneHotEncoder() 1.0
3 3 0.763 0.725 0.802 LogisticRegression() NaN NaN NaN NaN NaN NaN NaN NaN SimpleImputer() StandardScaler() None OneHotEncoder() 1.0
In [6]:
results.to_formatted_dataframe(return_style=True,
                               include_rank=True,
                               num_rows=1000)
Out[6]:
rank roc_auc Mean roc_auc 95CI.LO roc_auc 95CI.HI model C max_features max_depth n_estimators min_samples_split min_samples_leaf max_samples criterion imputer scaler pca encoder
1 0.769 0.723 0.814 RandomForestClassifier() <NA> 0.797 50.000 539.000 29.000 4.000 0.662 gini SimpleImputer(strategy='most_frequent') None None OneHotEncoder()
2 0.767 0.720 0.814 RandomForestClassifier() <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> SimpleImputer() None None OneHotEncoder()
3 0.763 0.725 0.802 LogisticRegression() <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> SimpleImputer() StandardScaler() None OneHotEncoder()
4 0.762 0.722 0.802 RandomForestClassifier() <NA> 0.249 74.000 729.000 17.000 14.000 0.789 gini SimpleImputer(strategy='most_frequent') None PCA('mle') CustomOrdinalEncoder()
5 0.756 0.712 0.799 RandomForestClassifier() <NA> 0.401 87.000 1,056.000 2.000 11.000 0.691 gini SimpleImputer() None PCA('mle') CustomOrdinalEncoder()
6 0.747 0.714 0.781 LogisticRegression() 32.731 <NA> <NA> <NA> <NA> <NA> <NA> <NA> SimpleImputer() StandardScaler() PCA('mle') OneHotEncoder()
7 0.726 0.697 0.755 LogisticRegression() 0.000 <NA> <NA> <NA> <NA> <NA> <NA> <NA> SimpleImputer(strategy='most_frequent') StandardScaler() PCA('mle') CustomOrdinalEncoder()
8 0.713 0.672 0.753 LogisticRegression() 0.003 <NA> <NA> <NA> <NA> <NA> <NA> <NA> SimpleImputer(strategy='median') MinMaxScaler() PCA('mle') CustomOrdinalEncoder()
In [7]:
results.to_formatted_dataframe(query='model == "RandomForestClassifier()"', include_rank=True)
Out[7]:
rank roc_auc Mean roc_auc 95CI.LO roc_auc 95CI.HI max_features max_depth n_estimators min_samples_split min_samples_leaf max_samples criterion imputer pca encoder
1 0.769 0.723 0.814 0.797 50.000 539.000 29.000 4.000 0.662 gini SimpleImputer(strategy='most_frequent') None OneHotEncoder()
2 0.767 0.720 0.814 <NA> <NA> <NA> <NA> <NA> <NA> <NA> SimpleImputer() None OneHotEncoder()
3 0.762 0.722 0.802 0.249 74.000 729.000 17.000 14.000 0.789 gini SimpleImputer(strategy='most_frequent') PCA('mle') CustomOrdinalEncoder()
4 0.756 0.712 0.799 0.401 87.000 1,056.000 2.000 11.000 0.691 gini SimpleImputer() PCA('mle') CustomOrdinalEncoder()
In [8]:
results.to_formatted_dataframe(query='model == "LogisticRegression()"', include_rank=True)
Out[8]:
rank roc_auc Mean roc_auc 95CI.LO roc_auc 95CI.HI C imputer scaler pca encoder
1 0.763 0.725 0.802 <NA> SimpleImputer() StandardScaler() None OneHotEncoder()
2 0.747 0.714 0.781 32.731 SimpleImputer() StandardScaler() PCA('mle') OneHotEncoder()
3 0.726 0.697 0.755 0.000 SimpleImputer(strategy='most_frequent') StandardScaler() PCA('mle') CustomOrdinalEncoder()
4 0.713 0.672 0.753 0.003 SimpleImputer(strategy='median') MinMaxScaler() PCA('mle') CustomOrdinalEncoder()

BayesSearchCV Performance Over Time¶

In [9]:
results.plot_performance_across_trials(facet_by='model').show()
In [10]:
results.plot_performance_across_trials(query='model == "RandomForestClassifier()"').show()

Variable Performance Over Time¶

In [11]:
results.plot_parameter_values_across_trials(query='model == "RandomForestClassifier()"').show()

Scatter Matrix¶

In [12]:
# results.plot_scatter_matrix(query='model == "RandomForestClassifier()"',
#                             height=1000, width=1000).show()

Variable Performance - Numeric¶

In [13]:
results.plot_performance_numeric_params(query='model == "RandomForestClassifier()"',
                                        height=800)
In [14]:
results.plot_parallel_coordinates(query='model == "RandomForestClassifier()"').show()

Variable Performance - Non-Numeric¶

In [15]:
results.plot_performance_non_numeric_params(query='model == "RandomForestClassifier()"').show()

In [16]:
results.plot_score_vs_parameter(
    query='model == "RandomForestClassifier()"',
    parameter='max_features',
    size='max_depth',
    color='encoder',
)

In [17]:
# results.plot_parameter_vs_parameter(
#     query='model == "XGBClassifier()"',
#     parameter_x='colsample_bytree',
#     parameter_y='learning_rate',
#     size='max_depth'
# )
In [18]:
# results.plot_parameter_vs_parameter(
#     query='model == "XGBClassifier()"',
#     parameter_x='colsample_bytree',
#     parameter_y='learning_rate',
#     size='imputer'
# )